# R script for reproducing tables and graphs shown in the BJPZ paper
# Step III: Converting estimates into tabular and graphical outputs

# START SCRIPT

rm(list = ls())

# Load libs ---------------------------------------------------------------

if (!require('tidyverse')) install.packages('tidyverse')
if (!require('readxl')) install.packages('readxl')
if (!require('writexl')) install.packages('writexl')

# Table 1: National totals ------------------------------------------------

## Read national excess death estimates ----

# The Following estimates are directly taken from J. D. Hacker's paper,
# 'A Census-Based Count of the Civil War Dead', 2021, Civil War History, 57(4): 307-348,
# Table 5 (p.333) & Table 8 Panel II (p.346)
tot_nat_1pct <- tibble::tribble(
  ~type, ~one_pct, 
  'nativeborn whites', '538842',
  'nativeborn whites bounds', '(450561 -- 627124)',
  'total deaths', '751562',
  'total deaths bounds', '(632115 -- 871009)'
)

est_full_tot <- readxl::read_xlsx('EST_excdeaths_national.xlsx')
tot_nat_full <- tibble::tribble(
  ~type, ~full_count, 
  'nativeborn whites', as.character(round(est_full_tot$Average[est_full_tot$Sample=='Native White Males Only' & est_full_tot$`Age Group`=='Total'])),
  'nativeborn whites bounds', paste0(
    "(",
    round(est_full_tot$`1850-1860`[est_full_tot$Sample=='Native White Males Only' & est_full_tot$`Age Group`=='Total']),
    " -- ",
    round(est_full_tot$`1870-1880`[est_full_tot$Sample=='Native White Males Only' & est_full_tot$`Age Group`=='Total']),
    ")"
  ),
  'total deaths', as.character(round(est_full_tot$Average[est_full_tot$Sample=='Including Black Deaths'])),
  'total deaths bounds', paste0(
    "(",
    round(est_full_tot$`1850-1860`[est_full_tot$Sample=='Including Black Deaths']),
    '--',
    round(est_full_tot$`1870-1880`[est_full_tot$Sample=='Including Black Deaths']),
    ")"
  )
)

## Tabulate national excess death estimates ----

# The table printed below was then manually typed into a LaTeX table
tot_nat_compare <- merge(tot_nat_1pct, tot_nat_full, by = 'type')
print(tot_nat_compare) 
# type                          one_pct         full_count
# nativeborn whites             538842             496332
# nativeborn whites bounds (450561 -- 627124) (462273 -- 530391)
# total deaths                  751562             698000
# total deaths bounds      (632115 -- 871009)   (647439--748561)

# Table 2: Regional totals ------------------------------------------------

## Read state-level excess death estimates ----

read_estimates_by_baseline <- function(filename, sheetnames = paste(c("1850-60", "1870-80", "Average"), "Baseline")) {
  purrr::map(sheetnames, ~readxl::read_xlsx(path = filename, sheet = .x)) |>
    `names<-`(sheetnames)
}

death_migadj_state = read_estimates_by_baseline("EST_excdeaths_count_bystate.xlsx")

## Compute region-level excess death estimates ----

add_regions_and_subregions <- function(df) {
  df |>
    dplyr::mutate(
      SUBREGION = case_when(
        STATE %in% c("Illinois","New Hampshire","Indiana", 
                     "Maine","Massachusetts", "Michigan", 
                     "New York","Ohio","Connecticut","Vermont","Pennsylvania",
                     "New Jersey","Rhode Island") ~ "Old North",
        STATE %in% c("Wisconsin","Iowa") ~ "New North",
        STATE %in% c("California","Oregon","Minnesota","Kansas","Nevada") ~ "Northern Frontiers",
        STATE %in% c("West Virginia","Delaware","Kentucky","Maryland","Missouri") ~ "Border States",
        STATE %in% c("Louisiana", "Georgia", "South Carolina","Alabama","North Carolina","Mississippi","Virginia","Tennessee") ~ "Old South",
        STATE %in% c("Texas","Arkansas","Florida") ~ "New South",
        TRUE ~ NA_character_
      )
    ) |>
    dplyr::mutate(
      REGION = case_when(
        SUBREGION %in% c("Old North", "New North", "Northern Frontiers") ~ "The North (Union)",
        SUBREGION %in% c("Border States") ~ "Border States",
        SUBREGION %in% c("Old South", "New South") ~ "The South (Confederacy)"
      )
    ) |>
    dplyr::select(-GROUP) |>
    dplyr::select(STATE, REGION, SUBREGION, everything()) |>
    dplyr::mutate(
      REGION = factor(REGION, levels = c("The North (Union)", "Border States", "The South (Confederacy)")),
      SUBREGION = factor(SUBREGION, levels = c("Old North", "New North", "Northern Frontiers", "Border States", "Old South", "New South"))
    ) |>
    dplyr::arrange(REGION, SUBREGION)
}
compute_deaths_by_region <- function(df) {
  df |>
    dplyr::group_by(REGION, SUBREGION) |>
    dplyr::summarise(
      POP = sum(POP),
      TOTAL = sum(TOTAL)
    ) |>
    dplyr::ungroup() |>
    dplyr::mutate(PCT = TOTAL/POP)
}
compute_prepost_bounds <- function(df_pre, df_post) { # making intervals around the average estimates
  if (!all.equal(df_pre[["REGION"]], df_post[["REGION"]])) stop("Regions not matched properly!")
  if (!all.equal(df_pre[["SUBREGION"]], df_post[["SUBREGION"]])) stop("Subregions not matched properly!")
  tot_pop = rep("", length(df_pre[["SUBREGION"]]))
  tot_pop[df_post[["SUBREGION"]]=="Northern Frontiers"] <- paste0("(",prettyNum(df_post[["POP"]][df_post[["SUBREGION"]]=="Northern Frontiers"], big.mark = ","), ")") # this contains Kansas and Nevada
  tot_death = paste0(
    "(", 
    prettyNum(round(df_pre[["TOTAL"]]), big.mark = ","), 
    " -- ", 
    prettyNum(round(df_post[["TOTAL"]]), big.mark = ","),
    ")"
  )
  tot_rate = paste0(
    "(", 
    scales::label_percent(.1)(df_pre[["TOTAL"]]/df_pre[["POP"]]), 
    " -- ", 
    scales::label_percent(.1)(df_post[["TOTAL"]]/df_post[["POP"]]),
    ")"
  )
  return(list(tot_pop=tot_pop, tot_death=tot_death, tot_rate=tot_rate))
}

native_deaths_by_region <- purrr::map(death_migadj_state, add_regions_and_subregions)

native_deaths_by_region_total <- native_deaths_by_region |>
  purrr::map(function(df) dplyr::filter(df, ! STATE%in% c("Kansas", "Nevada"))) |> # drop these two from northern frontier
  purrr::map(compute_deaths_by_region)

native_deaths_by_region_intervals = compute_prepost_bounds(
  native_deaths_by_region_total$`1850-60 Baseline`, 
  native_deaths_by_region_total$`1870-80 Baseline`)

all.equal(
  native_deaths_by_region_total$`Average Baseline`$`SUBREGION`,
  native_deaths_by_region_total$`1850-60 Baseline`$`SUBREGION`
) # TRUE

native_deaths_by_region_avgs = list(
  tot_pop = prettyNum(round(native_deaths_by_region_total$`Average Baseline`$`POP`), big.mark = ","),
  tot_death = prettyNum(round(native_deaths_by_region_total$`Average Baseline`$`TOTAL`), big.mark = ","),
  tot_rate = scales::label_percent(.1)(native_deaths_by_region_total$`Average Baseline`$`TOTAL`/native_deaths_by_region_total$`Average Baseline`$`POP`)
)

TOTnative_deaths_by_region <- native_deaths_by_region_total |>
  lapply(function(df) dplyr::filter(df, SUBREGION %in% c('Old North', 'Old South', 'Border States'))) |>
  lapply(function(df) dplyr::summarise(df, tot_pop = sum(POP), tot_death = sum(TOTAL), tot_rate = tot_death/tot_pop))

TOTnative_deaths_by_region_avg <- data.frame(
  'tot_pop' = prettyNum(round(TOTnative_deaths_by_region$`Average Baseline`$tot_pop), big.mark = ","),
  'tot_death' = prettyNum(round(TOTnative_deaths_by_region$`Average Baseline`$tot_death), big.mark = ","),
  'tot_rate' = scales::label_percent(.1)(TOTnative_deaths_by_region$`Average Baseline`$tot_rate)
) 

TOTnative_deaths_by_region_intervals <- data.frame(
  'tot_pop' = ' ',
  'tot_death' = paste0(
    "(", 
    prettyNum(round(TOTnative_deaths_by_region$`1850-60 Baseline`$tot_death), big.mark = ","), 
    " -- ", 
    prettyNum(round(TOTnative_deaths_by_region$`1870-80 Baseline`$tot_death), big.mark = ","),
    ")"
  ),
  'tot_rate' = paste0(
    "(", 
    scales::label_percent(.1)(TOTnative_deaths_by_region$`1850-60 Baseline`$tot_rate), 
    " -- ", 
    scales::label_percent(.1)(TOTnative_deaths_by_region$`1870-80 Baseline`$tot_rate),
    ")"
  )
)

## Tabulate region-level excess death estimates ----

tot_reg_avg <- as.data.frame(native_deaths_by_region_avgs)[c(1, 4, 5),]
tot_reg_bound <- as.data.frame(native_deaths_by_region_intervals)[c(1, 4, 5),]

tot_reg_df <- rbind.data.frame(
  tot_reg_avg[1,], tot_reg_bound[1,],
  tot_reg_avg[2,], tot_reg_bound[2,],
  tot_reg_avg[3,], tot_reg_bound[3,],
  TOTnative_deaths_by_region_avg,
  TOTnative_deaths_by_region_intervals
) |>
  `rownames<-`(
    c('Old North', 'ON Bounds', 'Border States', 'BS Bounds', 'Old South', 'OS Bounds',
      'Total', 'Total Bounds')
  )

# The table printed below was then manually typed into a LaTeX table
print(tot_reg_df)
#                 tot_pop            tot_death        tot_rate
# Old North     4,669,627              229,803            4.9%
# ON Bounds               (155,943 -- 303,664)  (3.3% -- 6.5%)
# Border States   942,787               38,069            4.0%
# BS Bounds                 (25,456 -- 50,682)  (2.7% -- 5.4%)
# Old South     1,471,203              192,160           13.1%
# OS Bounds               (129,269 -- 255,051) (8.8% -- 17.3%)
# Total         7,083,617              460,032            6.5%
# Total Bounds            (310,668 -- 609,396)  (4.4% -- 8.6%)

# Figure 1: State death rates ---------------------------------------------

## Read state-level excess mortality rate estimates ----

pctdead_migadj_state = read_estimates_by_baseline("EST_excdeaths_rate_bystate.xlsx")

## Prepare tidy data format for plotting ----

abbreviate_old_states <- function(states) {
  state_abbreviations <- c(
    "South Carolina" = "SC",
    "Louisiana" = "LA",
    "Georgia" = "GA",
    "North Carolina" = "NC",
    "Alabama" = "AL",
    "Illinois" = "IL",
    "Mississippi" = "MS",
    "Virginia" = "VA",
    "New Hampshire" = "NH",
    "West Virginia" = "WV",
    "Delaware" = "DE",
    "Tennessee" = "TN",
    "Maryland" = "MD",
    "Ohio" = "OH",
    "Maine" = "ME",
    "Michigan" = "MI", # HZ 2024-06-11: add MI to old north
    "Kentucky" = "KY",
    "New York" = "NY",
    "Massachusetts" = "MA",
    "Connecticut" = "CT",
    "Indiana" = "IN",
    "Missouri" = "MO",
    "Vermont" = "VT",
    "Pennsylvania" = "PA",
    "Rhode Island" = "RI",
    "New Jersey" = "NJ"
  )
  abbreviated_states <- unname(sapply(states, function(state) state_abbreviations[state]))
  return(abbreviated_states)
}

states_analyzed = native_deaths_by_region$`1870-80 Baseline` |>
  dplyr::filter(SUBREGION %in% c('Old North', 'Old South', 'Border States')) |>
  dplyr::pull(STATE)

pctdead_plot_df <- pctdead_migadj_state |>
  purrr::map(
    function(df) df|>
      dplyr::select(STATE, TOTAL, GROUP) |>
      dplyr::filter(STATE %in% states_analyzed) |>
      dplyr::mutate(
        GROUP = case_when(
          GROUP=="Union" ~ "North/Union",
          GROUP=="Confederate"~"South/Confederacy",
          GROUP=="Border"~"Border States"
        )
      )
  ) |>
  dplyr::bind_rows(.id = "baseline") |>
  dplyr::mutate(baseline = factor(
    baseline, levels = c("1850-60 Baseline", "1870-80 Baseline", "Average Baseline"))
  ) |>
  dplyr::group_by(GROUP, baseline) |>
  dplyr::arrange(-TOTAL) |>
  dplyr::mutate(rank = 1:n()) |>
  dplyr::ungroup() |>
  dplyr::mutate(GROUP = factor(GROUP, levels = c("North/Union","Border States", "South/Confederacy"))) |>
  dplyr::mutate(state_abbr = abbreviate_old_states(STATE)) 

pctdead_plot_df2 <-  pctdead_plot_df |>
  tidyr::pivot_wider(id_cols = c(STATE,GROUP,state_abbr), names_from = baseline, values_from = TOTAL) |>
  dplyr::left_join(
    pctdead_plot_df |>
      dplyr::filter(baseline == "Average Baseline") |>
      dplyr::select(STATE, rank),
    by = "STATE") |>
  dplyr::group_by(GROUP) |>
  dplyr::ungroup() |>
  dplyr::mutate(
    coord_statename = ifelse(`1870-80 Baseline`>`1850-60 Baseline`, `1870-80 Baseline`, `1850-60 Baseline`)
  ) |>
  dplyr::mutate(
    GROUP2 = case_when(
      GROUP=="North/Union" ~ "Northern States",
      GROUP=="Border States" ~ "Border States",
      GROUP=="South/Confederacy" ~ "Southern States"
    ) |>
      factor(levels = c("Northern States", "Border States", "Southern States"))
  )

## Plot bar chart of state-levle excess death rate estimates ----

ggplot(pctdead_plot_df2, 
       aes(x = `Average Baseline`, y = reorder(STATE, rank), 
           color = GROUP, fill = GROUP, shape = GROUP)) +
  geom_col(aes()) +
  geom_errorbarh(aes(xmin = `1850-60 Baseline`, xmax = `1870-80 Baseline`), color = "white", height=.25, size = 1.2) +
  geom_errorbarh(aes(xmin = `1850-60 Baseline`, xmax = `1870-80 Baseline`), height=.2) +
  geom_point(color = "white", size = 3, show.legend = FALSE) +
  geom_point(show.legend = FALSE) +
  geom_text(aes(x = coord_statename + 0.0075, 
                label = paste0(STATE, " (", round(100*`Average Baseline`, 2), "%", ")")),
            size = 5, hjust = "left",
            fontface = "bold",
            show.legend = FALSE
  ) +
  facet_grid(rows = vars(GROUP2), scales = "free_y", space = "free_y") +
  scale_fill_manual(breaks = c("North/Union","Border States", "South/Confederacy"),
                    labels = c("Old North", "Border States", "Old South"),
                    values = c("#2f8ac2","#818387", "#a94a28")) +
  scale_color_manual(breaks = c("North/Union","Border States", "South/Confederacy"),
                     labels = c("Old North", "Border States", "Old South"),
                     values = c("#2f8ac2","#818387", "#a94a28")) +
  scale_shape_manual(breaks = c("North/Union","Border States", "South/Confederacy"),
                     labels = c("Old North", "Border States", "Old South"),
                     values = c(19, 5, 18)) +
  scale_x_continuous(labels = scales::label_percent(),
                     limits = c(-0.059, 0.35)) +
  theme_minimal(base_size = 14) +
  labs(x = "Excess Mortality Rate (%)", y = NULL, color = NULL, fill = NULL) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    strip.text = element_text(size = 16),
    axis.text.x = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    legend.position = c(0.85, 0.9),
    legend.text = element_text(size = 15),
    legend.background = element_rect(colour = "black", linewidth = 0.5),
    legend.key.size = unit(1.3, "line"),
    legend.key.spacing.y = unit(.8, "line")
  )

# END SCRIPT HERE
